1 Introduction

This report examines the impact of major disruptions—including the COVID-19 pandemic, significant public events, and extreme weather—on train usage patterns in Sydney between 2020 and 2025. Using Opal card data and Bureau of Meteorology (BOM) records, we analyze changes in passenger volumes, identify key trends, and compare the magnitude and duration of different types of disruptions.


2 COVID-19 Impact on Sydney Train Usage

Below, we analyze the impact of COVID-19 on Sydney train station usage using Opal data. The code and analysis are directly adapted from T1.qmd, with each code block preceded by a description of its purpose.

# DATA PREP

# ignore closed stations
stations_to_ignore <- c("Rosehill", "Camellia", "Rydalmere", "Dundas", "Telopea", "Carlingford")

ee <- ee %>%
  filter(Station_Type %in% c("Train", "Metro Shared")) %>%
  mutate(
    Train_Station = gsub(" Station", "", Station),
    Train_Station = gsub(" $", "", Train_Station),
    TripNumber = as.numeric(ifelse(Trip == "Less than 50", 50, Trip)),
    MonthYear = as.Date(paste0(MonthYear, "-01"))
  ) %>%
  filter(!Train_Station %in% stations_to_ignore) %>%
  mutate(
    Phase = case_when(
      MonthYear >= as.Date("2020-01-01") & MonthYear <= as.Date("2021-06-30") ~ "Early COVID",
      MonthYear >= as.Date("2021-07-01") & MonthYear <= as.Date("2021-12-31") ~ "Delta COVID",
      MonthYear >= as.Date("2022-01-01") ~ "Post COVID",
      TRUE ~ NA_character_
    )
  ) %>%
  filter(!is.na(Phase))

# load and merge station coordinates
stations <- stations %>%
  filter(!duplicated(Train_Station)) %>%
  filter(Train_Station %in% ee$Train_Station)

ee <- ee %>%
  left_join(stations %>% 
  select(Train_Station, LAT, LONG), by = "Train_Station") %>%
  filter(!is.na(LAT) & !is.na(LONG))

ee_sf <- st_as_sf(ee, coords = c("LONG", "LAT"), crs = 4326)

# load route shapefile
trains <- st_read(shp_path, quiet = TRUE) %>% st_transform(crs = 4326)

# summarize
station_phase_totals <- ee_sf %>%
  st_drop_geometry() %>%
  group_by(Train_Station, Phase) %>%
  summarise(
    Total_Entries = sum(TripNumber[Entry_Exit == "Entry"], na.rm = TRUE),
    Total_Exits   = sum(TripNumber[Entry_Exit == "Exit"], na.rm = TRUE),
    .groups = "drop"
  )

# % drop and recovery + AVG
station_trends <- station_phase_totals %>%
  pivot_wider(
    names_from = Phase,
    values_from = c(Total_Entries, Total_Exits),
    names_glue = "{.value}_{gsub(' ', '_', Phase)}"
  ) %>%
  mutate(
    Entry_Drop_Pct = round((Total_Entries_Early_COVID - Total_Entries_Delta_COVID) / Total_Entries_Early_COVID * 100, 1),
    Entry_Recovery_Pct = round((Total_Entries_Post_COVID - Total_Entries_Delta_COVID) / Total_Entries_Delta_COVID * 100, 1),
    Exit_Drop_Pct = round((Total_Exits_Early_COVID - Total_Exits_Delta_COVID) / Total_Exits_Early_COVID * 100, 1),
    Exit_Recovery_Pct = round((Total_Exits_Post_COVID - Total_Exits_Delta_COVID) / Total_Exits_Delta_COVID * 100, 1),
    Avg_Drop_Pct = round((Entry_Drop_Pct + Exit_Drop_Pct) / 2, 1),
    Avg_Recovery_Pct = round((Entry_Recovery_Pct + Exit_Recovery_Pct) / 2, 1)
  )

2.1 Interactive Visual for Entry / Exits

This code generates a small bar chart for each station (as a tooltip), showing entries and exits by phase, and combines this with drop/recovery statistics for use in an interactive map.

# embedded bar chart tooltip
create_station_plot_base64 <- function(station_name) {
  station_data <- station_phase_totals %>% filter(Train_Station == station_name)
  station_data$Phase <- factor(station_data$Phase, levels = c("Early COVID", "Delta COVID", "Post COVID"))

  long_data <- station_data %>%
    pivot_longer(cols = c(Total_Entries, Total_Exits),
                 names_to = "Type", values_to = "Count") %>%
    mutate(Type = ifelse(Type == "Total_Entries", "Entries", "Exits"))

  p <- ggplot(long_data, aes(x = Phase, y = Count, fill = Type)) +
    geom_bar(stat = "identity", position = "dodge") +
    geom_text(aes(label = Count), position = position_dodge(0.9), vjust = -0.7, size = 3.2) +
    scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
    labs(title = station_name) +
    theme_minimal() +
    theme(
      legend.position = "top",
      axis.text.x = element_text(angle = 45, hjust = 1),
      plot.margin = margin(10, 10, 10, 10)
    )

  file_path <- tempfile(fileext = ".png")
  ggsave(file_path, plot = p, width = 4, height = 3.5, dpi = 150)
  encoded <- base64enc::dataURI(file = file_path, mime = "image/png")
  return(paste0("<img src='", encoded, "' width='260'>"))
}

# tooltip: embed chart + averaged % drop/recovery info
tooltip_df <- station_trends %>%
  mutate(
    chart_img = sapply(Train_Station, create_station_plot_base64),
    tooltip = paste0(
      chart_img, "<br>",
      "<b>Drop & Recovery:</b><br>",
      "Average Drop (Early → Delta): ", Avg_Drop_Pct, "%<br>",
      "Average Recovery (Delta → Post): ", Avg_Recovery_Pct, "%"
    )
  )

# df with spatial + tooltips
df <- ee_sf %>%
  filter(Entry_Exit == "Entry") %>%
  distinct(Train_Station, .keep_all = TRUE) %>%
  left_join(tooltip_df, by = "Train_Station")

2.2 Create interactive map of train station usage and drop/recovery

This code defines a function to plot all stations on a map, color-coded by route, with interactive tooltips showing drop and recovery statistics and a bar chart for each station. The map supports zooming.

# plot func with zoom
plot_all_phases <- function() {
  trains_proj <- st_transform(trains, crs = 3857)
  sydney_region <- ne_states(country = "Australia", returnclass = "sf") %>% filter(name_en == "New South Wales")
  sydney_proj <- st_transform(sydney_region, crs = 3857)
  df_proj <- st_transform(df, crs = 3857)

  bbox_syd <- st_bbox(c(xmin = 150.5, xmax = 151.35, ymin = -34.15, ymax = -33.35), crs = 4326)
  bbox_proj <- st_transform(st_as_sfc(bbox_syd), crs = 3857)

  gg <- ggplot() +
    geom_sf(data = sydney_proj, fill = "gray95", color = "gray85") +
    geom_sf(data = trains_proj, aes(color = route_shor), size = 0.6) +
    geom_point_interactive(
      data = df_proj,
      aes(geometry = geometry, tooltip = tooltip),
      stat = "sf_coordinates",
      size = 1.4,
      colour = "darkred"
    ) +
    coord_sf(xlim = st_bbox(bbox_proj)[c("xmin", "xmax")],
             ylim = st_bbox(bbox_proj)[c("ymin", "ymax")]) +
    ggtitle("Sydney Train Station Usage – COVID Phases") +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 16))

  girafe(
    ggobj = gg,
    options = list(
      opts_tooltip(css = "background-color:white; color:#00274D; font-size:10px; padding:5px;"),
      opts_zoom(max = 5)
    )
  )
}

plot_all_phases()

2.3 Plot monthly entry trend for Central Station

This code creates an interactive time series plot of monthly entries at Central Station, highlighting the lowest and highest points.

# prep monthly data
central_monthly <- ee %>%
  filter(Train_Station == "Central", Entry_Exit == "Entry") %>%
  group_by(Month = MonthYear) %>%
  summarise(Entries = sum(TripNumber, na.rm = TRUE), .groups = "drop")

min_point <- central_monthly %>% filter(Entries == min(Entries, na.rm = TRUE))
max_point <- central_monthly %>% filter(Entries == max(Entries, na.rm = TRUE))

# plot
p <- ggplot(central_monthly, aes(x = Month, y = Entries)) +
  geom_line(color = "#800026", size = 1.2, alpha = 0.9) +
  geom_point(aes(text = paste0("Month: ", format(Month, "%b %Y"),
                               "<br>Entries: ", scales::comma(Entries))),
             color = "#800026", size = 1.8) +
  geom_point(data = min_point, aes(x = Month, y = Entries),
             color = "#1f78b4", size = 4) +
  geom_point(data = max_point, aes(x = Month, y = Entries),
             color = "#33a02c", size = 4) +
  geom_text(data = min_point, aes(label = "Lowest", y = Entries - 15000),
            color = "#1f78b4", vjust = 1.5, hjust = 0.5, size = 3.5) +
  geom_text(data = max_point, aes(label = "Highest", y = Entries + 15000),
            color = "#33a02c", vjust = -0.5, hjust = 0.5, size = 3.5) +
  scale_y_continuous(labels = comma) +
  scale_x_date(date_breaks = "3 months", date_labels = "%b\n%Y") +
  labs(
    title = "Central Station – Monthly Entry Trend (2020–2024)",
    x = NULL, y = "Number of Entries"
  ) +
  theme_minimal(base_family = "Arial") +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 16, color = "#333333"),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 9),
    axis.text.y = element_text(size = 9),
    axis.title.y = element_text(size = 10, face = "bold"),
    panel.grid.minor = element_blank()
  )

ggplotly(p, tooltip = "text")

2.4 Interactive time series for all stations with dropdown

This code creates an interactive time series plot for monthly entries at every station, with a dropdown menu to select the station.

# data for monthly entries by station
ee_station_ts <- ee %>%
  filter(Entry_Exit == "Entry") %>%
  group_by(Train_Station, MonthYear) %>%
  summarise(Total_Entries = sum(TripNumber, na.rm = TRUE), .groups = "drop") %>%
  arrange(Train_Station, MonthYear)

initial_station <- "Central"

# Initial ggplot
p <- ggplot(ee_station_ts %>% filter(Train_Station == initial_station),
            aes(x = MonthYear, y = Total_Entries)) +
  geom_line(color = "#990033", linewidth = 1.2) +
  geom_point(color = "#990033") +
  labs(
       x = "Month", y = "Entries") +
  theme_minimal()

pl <- ggplotly(p)

# y-values only for each station
stations <- unique(ee_station_ts$Train_Station)
station_series <- lapply(stations, function(station) {
  list(ee_station_ts %>% filter(Train_Station == station) %>% pull(Total_Entries))
})

# drop down for y axis
dropdown_buttons <- lapply(seq_along(stations), function(i) {
  list(
    method = "restyle",
    args = list("y", station_series[[i]]),
    label = stations[i]
  )
})

# drop down position
pl <- pl %>%
  layout(
  updatemenus = list(
    list(
      type = "dropdown",
      direction = "down",
      x = 0,
      xanchor = "left",
      y = 1.1,
      yanchor = "top",
      showactive = TRUE,
      buttons = dropdown_buttons
    )
  ),
  xaxis = list(title = "Month"),
  yaxis = list(title = "Entries")
)

pl

3 Public Events and Temporary Increases in Ridership

Events, Festivals, and Spikes

Public events such as Vivid Sydney, New Year’s Eve, and major concerts have a significant but temporary impact on train ridership in Sydney. This section analyzes Opal card data to quantify changes in train usage during these events, with a focus on the Sydney CBD (Circular Quay, Martin Place, Town Hall).

3.1 Vivid Sydney: Impact on Train Usage

Vivid Sydney is an annual festival that draws large crowds to the CBD. We compare average train tap-ons during Vivid Sydney to normal days, both for all-day and evening (7–10pm) periods.

3.1.1 Train: Average Tap-Ons

# DATA PREP

opal_selected <- opal_selected %>%
  mutate(date = as.Date(trip_origin_date), year = as.integer(format(date, "%Y")))

# each year vivid period
vivid_days <- function(year) {
  seq(as.Date(paste0(year, "-05-22")), as.Date(paste0(year, "-06-14")), by = "day")
}
years <- sort(unique(opal_selected$year))

# all vivid dates for all years
vivid_dates_list <- lapply(years, vivid_days)
names(vivid_dates_list) <- years
vivid_dates <- unlist(vivid_dates_list)
vivid_dates <- vivid_dates[!is.na(vivid_dates)]

# exclude dates for normal days
exclude_dates <- unique(c(
  vivid_dates,
  as.Date(c("2024-02-23", "2024-02-24", "2024-02-25", "2024-02-26", "2024-10-22", "2024-10-23",
            "2024-01-01", "2024-01-26", "2024-03-29", "2024-03-31", "2024-04-01", "2024-04-25", "2024-12-25", "2024-12-26"))
))

tap_bin_map <- c("<50"=25,"50-100"=75,"100-200"=150,"200-400"=300,"400-800"=600,"800-1600"=1200,"1600-3200"=2400,"3200-6400"=4800,"More than 6400"=8000)

# helper
tap_ons <- function(x) {
  suppressWarnings({
    num <- as.numeric(x)
    is_num <- !is.na(num)
    out <- num
    out[!is_num] <- tap_bin_map[x[!is_num]]
    as.numeric(out)
  })
}

# 10 random normal days per year
valid_normal_dates <- opal_selected %>%
  filter(ti_region == "Sydney CBD", mode_name %in% c("Bus","Light rail","Train","Ferry"), !date %in% exclude_dates, mode_name != "UNKNOWN") %>%
  select(date, year) %>% distinct()
set.seed(42)
normal_sample <- valid_normal_dates %>% group_by(year) %>% summarise(date = sample(date, min(10, n())), .groups="drop")

# All-day avg
get_avg <- function(dates, label) {
  opal_selected %>%
    filter(ti_region == "Sydney CBD", mode_name == "Train", date %in% dates) %>%
    mutate(Tap_Ons_Est = tap_ons(Tap_Ons)) %>%
    group_by(year) %>%
    summarise(avg_tap_ons = mean(Tap_Ons_Est, na.rm=TRUE), .groups="drop") %>%
    mutate(period = label)
}

# All-day avg for normal and vivid
normal_days <- get_avg(normal_sample$date, "Normal")
vivid_days_df <- bind_rows(
  lapply(years, function(y) get_avg(vivid_days(y), "Vivid Sydney"))
)

compare_data_train <- bind_rows(normal_days, vivid_days_df) %>%
  mutate(period = factor(period, c("Normal", "Vivid Sydney")),
         avg_tap_ons = as.numeric(avg_tap_ons))

# All-day plot
plot_train_all_day <- ggplot(compare_data_train, aes(
    x = as.factor(year), y = avg_tap_ons, fill = period,
    tooltip = paste0("Year: ", year, "<br>Period: ", period, "<br>Avg Tap-Ons: ", comma(round(avg_tap_ons)))
  )) +
  geom_col_interactive(position=position_dodge(0.7), width=0.6) +
  geom_text(aes(label=comma(round(avg_tap_ons))), position=position_dodge(0.7), vjust=-0.3, size=3, color="black") +
  theme_minimal(base_size=13) +
  labs(title="Train: Average Tap-Ons (All Day)", x="Year", y="Average Tap-Ons (per day)") +
  scale_fill_manual(values=c("Normal"="#A9A9A9", "Vivid Sydney"="#0072B2")) +
  theme(legend.position="top", panel.grid.major.x=element_blank(), axis.text.x=element_text(face="bold"), plot.title=element_text(face="bold", size=15))

# 7–10pm avg
opal_cbd_evening <- opal_selected %>%
  filter(ti_region == "Sydney CBD", as.integer(tap_hour) %in% 19:21, mode_name == "Train") %>%
  mutate(Tap_Ons_Est = tap_ons(Tap_Ons))

vivid_dates_clean <- as.Date(vivid_dates, origin = "1970-01-01")
vivid_dates_clean <- vivid_dates_clean[!is.na(vivid_dates_clean)]
vivid_dates_clean <- unique(vivid_dates_clean)

samples_evening <- bind_rows(
  normal_sample %>% mutate(period="Normal"),
  tibble(date = vivid_dates_clean, year = as.integer(format(vivid_dates_clean, "%Y")), period = "Vivid Sydney")
)
summary_by_mode_evening <- samples_evening %>%
  left_join(opal_cbd_evening %>% group_by(date, year) %>% summarise(avg_tap_ons=mean(Tap_Ons_Est, na.rm=TRUE), .groups="drop"), by=c("date","year")) %>%
  filter(!is.na(avg_tap_ons)) %>%
  group_by(year, period) %>%
  summarise(avg_tap_ons=mean(avg_tap_ons, na.rm=TRUE), .groups="drop") %>%
  mutate(period=factor(period, c("Normal", "Vivid Sydney")), year=as.factor(year),
         avg_tap_ons = as.numeric(avg_tap_ons))

# 7–10pm plot
plot_train_evening <- ggplot(summary_by_mode_evening, aes(
    x=year, y=avg_tap_ons, fill=period, group=period,
    tooltip=paste0("Year: ", year, "<br>Avg Tap-Ons: ", comma(round(avg_tap_ons)))
  )) +
  geom_col_interactive(position=position_dodge(0.7), width=0.6) +
  geom_text(aes(label=comma(round(avg_tap_ons))), position=position_dodge(0.7), vjust=-0.3, size=3.5, color="black") +
  theme_minimal(base_size=14) +
  labs(title="Train: Average Tap-Ons (7–10pm)", x="Year", y="Average Tap-Ons (7–10pm)") +
  scale_fill_manual(values=c("Normal"="#A9A9A9", "Vivid Sydney"="#0072B2")) +
  theme(legend.position="top", panel.grid.major.x=element_blank(), axis.text.x=element_text(face="bold"), plot.title=element_text(face="bold", size=16))

3.1.1.1 All Day

girafe(ggobj = plot_train_all_day)

3.1.1.2 Evening (7-10pm)

girafe(ggobj = plot_train_evening)

3.2 Major Concerts: Taylor Swift, The Weeknd, Coldplay

In 2024, several high-profile concerts were held in Sydney, including Taylor Swift (February), The Weeknd (October), and Coldplay (December). The following analysis compares average train tap-ons on concert days to normal days, for both all-day and evening periods.

3.2.1 Average: Train Tap-Ons (All NSW, 2024)

# special dates
taylor_dates <- as.Date(c("2024-02-23", "2024-02-24", "2024-02-25", "2024-02-26"))
weeknd_dates <- as.Date(c("2024-10-22", "2024-10-23"))
coldplay_dates <- as.Date(c("2024-12-06", "2024-12-07", "2024-12-09", "2024-12-10"))
vivid_dates <- seq(as.Date("2024-05-24"), as.Date("2024-06-15"), by = "day")
public_holidays <- as.Date(c(
  # 2020
  "2020-01-01", "2020-01-27", "2020-04-10", "2020-04-12", "2020-04-13", "2020-04-25", "2020-06-08", "2020-10-05", "2020-12-25", "2020-12-28",
  # 2021
  "2021-01-01", "2021-01-26", "2021-04-02", "2021-04-04", "2021-04-05", "2021-04-25", "2021-06-14", "2021-10-04", "2021-12-25", "2021-12-27", "2021-12-28",
  # 2022
  "2022-01-01", "2022-01-26", "2022-04-15", "2022-04-17", "2022-04-18", "2022-04-25", "2022-06-13", "2022-10-03", "2022-12-25", "2022-12-26", "2022-12-27",
  # 2023
  "2023-01-01", "2023-01-26", "2023-04-07", "2023-04-09", "2023-04-10", "2023-04-25", "2023-06-12", "2023-10-02", "2023-12-25", "2023-12-26",
  # 2024
  "2024-01-01", "2024-01-26", "2024-03-29", "2024-03-31", "2024-04-01", "2024-04-25", "2024-06-10", "2024-10-07", "2024-12-25", "2024-12-26"
))
exclude_dates <- c(taylor_dates, weeknd_dates, coldplay_dates, vivid_dates, public_holidays)

tap_bin_map <- c("<50"=25,"50-100"=75,"100-200"=150,"200-400"=300,"400-800"=600,"800-1600"=1200,"1600-3200"=2400,"3200-6400"=4800,"More than 6400"=8000)
tap_ons <- function(x) {
  suppressWarnings({
    num <- as.numeric(x)
    is_num <- !is.na(num)
    out <- num
    out[!is_num] <- tap_bin_map[x[!is_num]]
    as.numeric(out)
  })
}

# prep of opal_selected for 2024 trains
opal_2024_train <- opal_selected %>%
  mutate(
    date = as.Date(trip_origin_date),
    year = as.factor(format(date, "%Y")),
    Tap_Ons_Est = tap_ons(Tap_Ons),
    tap_hour = as.integer(tap_hour)
  ) %>%
  filter(ti_region == "Sydney CBD", mode_name == "Train", year == "2024")

# All-Day
special_taylor_all <- opal_2024_train %>%
  filter(date %in% taylor_dates) %>%
  summarise(avg_tap_ons = mean(Tap_Ons_Est, na.rm = TRUE)) %>%
  mutate(period = "Concerts (Feb 23-26, 2024)")

special_weeknd_all <- opal_2024_train %>%
  filter(date %in% weeknd_dates) %>%
  summarise(avg_tap_ons = mean(Tap_Ons_Est, na.rm = TRUE)) %>%
  mutate(period = "Concerts (Oct 22-23, 2024)")

special_coldplay_all <- opal_2024_train %>%
  filter(date %in% coldplay_dates) %>%
  summarise(avg_tap_ons = mean(Tap_Ons_Est, na.rm = TRUE)) %>%
  mutate(period = "Concerts (Dec 6,7,9,10, 2024)")

normal_days_all <- opal_2024_train %>%
  filter(!date %in% exclude_dates) %>%
  summarise(avg_tap_ons = mean(Tap_Ons_Est, na.rm = TRUE)) %>%
  mutate(period = "Normal")

compare_data_train_all <- bind_rows(normal_days_all, special_taylor_all, special_weeknd_all, special_coldplay_all) %>%
  mutate(period = factor(period, c("Normal", "Concerts (Feb 23-26, 2024)", "Concerts (Oct 22-23, 2024)", "Concerts (Dec 6,7,9,10, 2024)")))

# 7-10pm
opal_2024_train_evening <- opal_2024_train %>%
  filter(tap_hour %in% 19:21)

special_taylor_pm <- opal_2024_train_evening %>%
  filter(date %in% taylor_dates) %>%
  summarise(avg_tap_ons = mean(Tap_Ons_Est, na.rm = TRUE)) %>%
  mutate(period = "Concerts (Feb 23-26, 2024)")

special_weeknd_pm <- opal_2024_train_evening %>%
  filter(date %in% weeknd_dates) %>%
  summarise(avg_tap_ons = mean(Tap_Ons_Est, na.rm = TRUE)) %>%
  mutate(period = "Concerts (Oct 22-23, 2024)")

special_coldplay_pm <- opal_2024_train_evening %>%
  filter(date %in% coldplay_dates) %>%
  summarise(avg_tap_ons = mean(Tap_Ons_Est, na.rm = TRUE)) %>%
  mutate(period = "Concerts (Dec 6,7,9,10, 2024)")

normal_days_pm <- opal_2024_train_evening %>%
  filter(!date %in% exclude_dates) %>%
  summarise(avg_tap_ons = mean(Tap_Ons_Est, na.rm = TRUE)) %>%
  mutate(period = "Normal")

compare_data_train_pm <- bind_rows(normal_days_pm, special_taylor_pm, special_weeknd_pm, special_coldplay_pm) %>%
  mutate(period = factor(period, c("Normal", "Concerts (Feb 23-26, 2024)", "Concerts (Oct 22-23, 2024)", "Concerts (Dec 6,7,9,10, 2024)")))

period_colors <- c(
  "Normal" = "#A9A9A9",
  "Concerts (Feb 23-26, 2024)" = "#EF553B",
  "Concerts (Oct 22-23, 2024)" = "#EF553B",
  "Concerts (Dec 6,7,9,10, 2024)" = "#EF553B"
)

# plot
plot_all_day <- ggplot(compare_data_train_all, aes(
    x = period, y = avg_tap_ons, fill = period,
    tooltip = paste0("Period: ", period, "<br>Avg Tap-Ons: ", comma(round(avg_tap_ons)))
  )) +
  geom_col_interactive(width = 0.6) +
  geom_text(aes(label = comma(round(avg_tap_ons))), vjust = -0.3, size = 3, color = "black") +
  theme_minimal(base_size = 13) +
  labs(
    title = "Average Train Tap-Ons (All Day, All - NSW, 2024)",
    x = "Period", y = "Average Tap-Ons (per day)"
  ) +
  scale_fill_manual(values = period_colors) +
  theme(
    legend.position = "none",
    panel.grid.major.x = element_blank(),
    axis.text.x = element_text(face = "bold"),
    plot.title = element_text(face = "bold", size = 15)
  )

plot_evening <- ggplot(compare_data_train_pm, aes(
    x = period, y = avg_tap_ons, fill = period,
    tooltip = paste0("Period: ", period, "<br>Avg Tap-Ons: ", comma(round(avg_tap_ons)))
  )) +
  geom_col_interactive(width = 0.6) +
  geom_text(aes(label = comma(round(avg_tap_ons))), vjust = -0.3, size = 3, color = "black") +
  theme_minimal(base_size = 13) +
  labs(
    title = "Average Train Tap-Ons (7–10pm, All - NSW, 2024)",
    x = "Period", y = "Average Tap-Ons (7–10pm)"
  ) +
  scale_fill_manual(values = period_colors) +
  theme(
    legend.position = "none",
    panel.grid.major.x = element_blank(),
    axis.text.x = element_text(face = "bold"),
    plot.title = element_text(face = "bold", size = 15)
  )

3.2.1.1 All Day

girafe(ggobj=plot_all_day)

3.2.1.2 Evening (7-10pm)

girafe(ggobj=plot_evening)

3.3 Public Holidays and Year-End Shutdown

Public holidays and the annual year-end shutdown (December 25 to January 4) also affect train ridership patterns. This section compares average train tap-ons during these periods to normal days, using data from 2020 to 2024.

# filter and prep train data
opal_trains <- opal_selected %>%
  mutate(
    date = as.Date(trip_origin_date),
    year = as.character(format(date, "%Y")),
    Tap_Ons_Est = tap_ons(Tap_Ons)
  ) %>%
  filter(ti_region == "Sydney CBD", mode_name == "Train", year %in% as.character(2020:2024))

# shutdown date ranges
shutdown_ranges <- list(
  "2020" = seq(as.Date("2020-12-25"), as.Date("2021-01-04"), by = "day"),
  "2021" = seq(as.Date("2021-12-25"), as.Date("2022-01-04"), by = "day"),
  "2022" = seq(as.Date("2022-12-25"), as.Date("2023-01-04"), by = "day"),
  "2023" = seq(as.Date("2023-12-25"), as.Date("2024-01-04"), by = "day"),
  "2024" = seq(as.Date("2024-12-25"), as.Date("2025-01-04"), by = "day")
)

shutdown_lookup <- bind_rows(
  lapply(names(shutdown_ranges), function(yr) {
    tibble(date = shutdown_ranges[[yr]], shutdown_year = yr)
  })
)

# join shutdown info
opal_trains_shutdown <- opal_trains %>%
  left_join(shutdown_lookup, by = "date")

special_shutdown <- opal_trains_shutdown %>%
  filter(!is.na(shutdown_year)) %>%
  group_by(year = shutdown_year) %>%
  summarise(avg_tap_ons = mean(Tap_Ons_Est, na.rm = TRUE), .groups = "drop") %>%
  mutate(period = "Shutdown (Dec 25–Jan 4)")

normal_days <- opal_trains %>%
  filter(!date %in% exclude_dates) %>%
  group_by(year) %>%
  summarise(avg_tap_ons = mean(Tap_Ons_Est, na.rm = TRUE), .groups = "drop") %>%
  mutate(period = "Normal")

special_holidays <- opal_trains %>%
  filter(date %in% public_holidays) %>%
  group_by(year) %>%
  summarise(avg_tap_ons = mean(Tap_Ons_Est, na.rm = TRUE), .groups = "drop") %>%
  mutate(period = "Public Holidays")

compare_data_trains_shutdown <- bind_rows(normal_days, special_holidays, special_shutdown) %>%
  mutate(period = factor(period, c("Normal", "Public Holidays", "Shutdown (Dec 25–Jan 4)")))

# ensure complete grid
all_years <- as.character(2020:2024)
all_periods <- c("Normal", "Public Holidays", "Shutdown (Dec 25–Jan 4)")
complete_grid <- expand.grid(year = all_years, period = all_periods, stringsAsFactors = FALSE)

compare_data_trains_shutdown_complete <- complete_grid %>%
  left_join(compare_data_trains_shutdown, by = c("year", "period")) %>%
  mutate(
    period = factor(period, all_periods),
    year = factor(year, all_years)
  )

# colours
shutdown_colors <- c(
  "Normal" = "#A9A9A9",
  "Public Holidays" = "#EF553B",
  "Shutdown (Dec 25–Jan 4)" = "#999999"
)

# plot
g_shutdown <- ggplot(compare_data_trains_shutdown_complete, aes(
    x = year, 
    y = avg_tap_ons, 
    fill = period,
    tooltip = paste0("Year: ", year, "<br>Period: ", period, "<br>Avg Tap-Ons: ", ifelse(is.na(avg_tap_ons), "No data", comma(round(avg_tap_ons))))
  )) +
  geom_col_interactive(position = position_dodge(width = 0.7), width = 0.6, na.rm = TRUE) +
  geom_text(
    aes(label = ifelse(is.na(avg_tap_ons), "", comma(round(avg_tap_ons)))),
    position = position_dodge(width = 0.7),
    vjust = -0.3, size = 3, color = "black", show.legend = FALSE, na.rm = TRUE
  ) +
  theme_minimal(base_size = 13) +
  labs(
    title = "Average Train Tap-Ons (Sydney CBD, 2020–2024)",
    subtitle = "Normal days exclude Vivid, concerts, public holidays, and shutdown period",
    x = "Year", 
    y = "Average Tap-Ons (per day)"
  ) +
  scale_fill_manual(values = shutdown_colors) +
  theme(
    legend.position = "top",
    panel.grid.major.x = element_blank(),
    axis.text.x = element_text(face = "bold"),
    plot.title = element_text(face = "bold", size = 15),
    plot.subtitle = element_text(size = 11)
  )

girafe(ggobj = g_shutdown)

3.4 Comparative Impact of Events: Radar Chart

The following radar chart summarizes the average train tap-ons by period (Normal, Vivid Sydney, Public Holidays, Shutdown) for each year. This visualization provides a comparative overview of the impact of different event types on train ridership in the Sydney CBD.

# prep vivid sample days
set.seed(123)

vivid_sample_dates <- tibble(
  year = as.integer(2020:2023),
  vivid_date = c(
    sample(seq(as.Date("2020-05-22"), as.Date("2020-06-14"), by = "day"), 1),
    sample(seq(as.Date("2021-05-21"), as.Date("2021-06-12"), by = "day"), 1),
    sample(seq(as.Date("2022-05-27"), as.Date("2022-06-18"), by = "day"), 1),
    sample(seq(as.Date("2023-05-26"), as.Date("2023-06-17"), by = "day"), 1)
  )
)

vivid_sample_values <- vivid_sample_dates %>%
  left_join(opal_selected %>%
              filter(ti_region == "Sydney CBD", mode_name == "Train") %>%
              mutate(Tap_Ons_Est = tap_ons(Tap_Ons)),
            by = c("year", "vivid_date" = "date")) %>%
  group_by(year) %>%
  summarise(avg_tap_ons = mean(Tap_Ons_Est, na.rm = TRUE)) %>%
  mutate(period = "Vivid Sydney")

# 2024 vivid avg
vivid_2024 <- vivid_days_df %>%
  filter(year == 2024) %>%
  select(year, avg_tap_ons) %>%
  mutate(period = "Vivid Sydney")

vivid_days_all <- bind_rows(
  vivid_sample_values,
  vivid_2024
) %>%
  arrange(year) %>%
  mutate(year = as.character(year))

normal_days <- normal_days %>% mutate(year = as.character(year))
special_holidays <- special_holidays %>% mutate(year = as.character(year))
special_shutdown <- special_shutdown %>% mutate(year = as.character(year))

all_periods <- bind_rows(
  normal_days %>% mutate(period = "Normal"),
  vivid_days_all,
  special_holidays %>% mutate(period = "Public Holidays"),
  special_shutdown %>% mutate(period = "Shutdown")
)

# 1 row per year, cols for each period
radar_data <- all_periods %>%
  select(year, period, avg_tap_ons) %>%
  tidyr::pivot_wider(names_from = period, values_from = avg_tap_ons)

radar_data <- radar_data %>%
  arrange(year) %>%
  select(year, "Normal", "Vivid Sydney", "Public Holidays", "Shutdown")

# remove year col for fmsb
radar_data_fmsb <- radar_data %>% select(-year)
radar_data_fmsb[] <- lapply(radar_data_fmsb, function(x) as.numeric(as.character(x)))
radar_data_fmsb[is.na(radar_data_fmsb)] <- 0

# max/min rows for scaling
radar_data_fmsb <- rbind(
  apply(radar_data_fmsb, 2, max, na.rm=TRUE),
  apply(radar_data_fmsb, 2, min, na.rm=TRUE),
  radar_data_fmsb
)

year_labels <- as.character(radar_data$year)
rownames(radar_data_fmsb) <- c("Max", "Min", year_labels)

colors_border <- brewer.pal(max(3, length(year_labels)), "Set1")[1:length(year_labels)]

# plot
radarchart(radar_data_fmsb,
           axistype=1,
           pcol=colors_border,
           plwd=2,
           plty=1,
           cglcol="grey", cglty=1, axislabcol="grey", caxislabels=round(seq(0, max(radar_data_fmsb, na.rm=TRUE), length.out=5)), cglwd=0.8,
           vlcex=1.1,
           title="Yearly Average Train Tap-Ons by Period (Sydney CBD)"
)
legend("topright", legend=year_labels, bty="n", pch=20, col=colors_border, text.col="black", cex=1.1, pt.cex=2)

Inference: Public events and holidays cause sharp but short-lived increases or decreases in train ridership, with the most pronounced spikes during major concerts and Vivid Sydney. The radar chart highlights the relative magnitude and duration of these disruptions compared to normal periods.


4 Part 3: Weather Events and Ridership Fluctuations

Rain, Floods, and Recovery

  • Focus: Periods of significant rainfall (e.g., March 2022 floods)

  • Data: BOM and Opal

  • Analysis of train usage during extreme weather events

  • Visualization: Daily entries versus rainfall (dual axis plot)

Key Finding: Severe weather events caused temporary declines in train usage, with recovery typically occurring within days.

4.1 Heatwaves and Floods

4.1.1 Heatwaves

# filter cols and remove NAs
sydney_weather_filtered <- sydney_weather %>%
  select(Date, MaxTemp, region) %>%
  arrange(Date) %>%
  filter(!is.na(MaxTemp)) %>%
  mutate(HotDay = MaxTemp > 35)

rle_hot <- rle(sydney_weather_filtered$HotDay)
hot_streaks <- inverse.rle(rle_hot)
sydney_weather_filtered$HeatwaveGroup <- cumsum(c(TRUE, diff(hot_streaks) != 0))

# flag actual heatwave days (3+ hot days in a row)
sydney_weather_filtered$IsHeatwaveDay <- with(
  sydney_weather_filtered,
  ave(HotDay, HeatwaveGroup, FUN = function(x) all(x) & length(x) >= 3)
)

# keep heatwave days only
heatwave_days <- sydney_weather_filtered %>%
  filter(IsHeatwaveDay) %>%
  select(Date, MaxTemp, region, HeatwaveGroup) %>%
  mutate(tooltip = paste0(
    "Date: ", Date,
    "\nMax Temp: ", MaxTemp, "°C",
    "\nRegion: ", region
  ))

heatwave_plot <- ggplot(heatwave_days, aes(x = Date, y = MaxTemp, color = region)) +
  geom_point_interactive(aes(tooltip = tooltip), size = 3, alpha = 0.8) +
  geom_smooth(method = "loess", se = FALSE, size = 0.6, linetype = "dashed") +
  scale_y_continuous(name = "Max Temperature (°C)", limits = c(34, NA)) +
  scale_x_date(name = "Date") +
  labs(title = "Heatwave Days in Sydney by Region (2020–2025)", color = "Region") +
  theme_minimal(base_size = 13) +
  theme(legend.position = "bottom")

girafe(ggobj = heatwave_plot)

4.1.2 Floods

# filter flood-risk days (Rainfall > 50mm)
flood_risk_days <- sydney_weather %>%
  filter(`Rainfall` > 50) %>%
  select(Date, `Rainfall`, region)

flood_risk_days <- flood_risk_days %>%
  arrange(Date, region)

# consecutive-day flood events with a FloodEvent ID
flood_risk_days <- flood_risk_days %>%
  group_by(region) %>%
  mutate(
    gap = c(0, as.numeric(diff(Date))),
    FloodEvent = cumsum(gap > 1) + 1
  ) %>%
  select(-gap, -FloodEvent) %>%
  ungroup()

flood_risk_days <- flood_risk_days %>%
  mutate(
    tooltip = paste0(
      "Date: ", Date,
      "\nRainfall: ", Rainfall, " mm"
    )
  )

# plot
flood_plot <- ggplot(flood_risk_days, aes(x = Date, y = Rainfall, color = region)) +
  geom_point_interactive(aes(tooltip = tooltip), size = 2, alpha = 0.8) +
  facet_wrap(~ region, ncol = 2, scales = "free_y") +
  scale_color_brewer(palette = "Set1") +
  labs(
    title = "Flood-Risk Days by Region in Sydney (2020–2025)",
    x = "Date", y = "Rainfall (mm)",
    color = "Region"
  ) +
  theme_minimal(base_size = 13)

girafe(ggobj = flood_plot)

4.2 Train Usage During Extreme Weathers

4.2.1 Heatwaves

# analyze and tag daily train usage on heatwave vs non-heatwave days
all_train_data <- all_train_data %>%
  mutate(trip_origin_date = as.Date(trip_origin_date))

train_with_weather <- all_train_data %>%
  mutate(IsHeatwave = trip_origin_date %in% heatwave_days$Date)

heatwave_comparison <- train_with_weather %>%
  group_by(IsHeatwave) %>%
  summarise(
    avg_tap_ons = mean(Tap_Ons, na.rm = TRUE),
    median_tap_ons = median(Tap_Ons, na.rm = TRUE),
    count = n()
  )

print(heatwave_comparison)
## # A tibble: 2 × 4
##   IsHeatwave avg_tap_ons median_tap_ons  count
##   <lgl>            <dbl>          <dbl>  <int>
## 1 FALSE            6130.            600 456891
## 2 TRUE             6605.            700   9790
# plot
daily_tap_ons <- train_with_weather %>%
  group_by(trip_origin_date, IsHeatwave) %>%
  summarise(daily_total = sum(Tap_Ons, na.rm = TRUE), .groups = "drop") %>%
  mutate(
    tooltip = paste0(
      "Date: ", trip_origin_date,
      "\nTotal Tap-Ons: ", daily_total
    )
  )

p <- ggplot(daily_tap_ons, aes(x = IsHeatwave, y = daily_total, fill = IsHeatwave)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.6) +
  geom_point_interactive(aes(tooltip = tooltip), position = position_jitter(width = 0.2), alpha = 0.3, color = "black") +
  labs(
    title = "Daily Train Usage on Heatwave vs Non-Heatwave Days",
    x = "Heatwave Day",
    y = "Total Train Tap-Ons (per day)"
  ) +
  scale_fill_manual(values = c("FALSE" = "#4B9CD3", "TRUE" = "#D55E00")) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")

girafe(ggobj = p)

4.2.2 Floods

# add IsFlood col
train_with_weather <- train_with_weather %>%
  mutate(IsFlood = trip_origin_date %in% flood_risk_days$Date)

# summarise
flood_comparison <- train_with_weather %>%
  group_by(IsFlood) %>%
  summarise(
    avg_tap_ons = mean(Tap_Ons, na.rm = TRUE),
    median_tap_ons = median(Tap_Ons, na.rm = TRUE),
    count = n()
  )

print(flood_comparison)
## # A tibble: 2 × 4
##   IsFlood avg_tap_ons median_tap_ons  count
##   <lgl>         <dbl>          <dbl>  <int>
## 1 FALSE         6159.            600 457552
## 2 TRUE          5165.            500   9129
# plot
flood_comparison <- train_with_weather %>%
  mutate(IsFlood = trip_origin_date %in% flood_risk_days$Date) %>%
  group_by(IsFlood) %>%
  summarise(
    avg_tap_ons = mean(Tap_Ons, na.rm = TRUE),
    sd = sd(Tap_Ons, na.rm = TRUE),
    n = n(),
    se = sd / sqrt(n),
    .groups = "drop"
  ) %>%
  mutate(
    tooltip = paste0(
      "\nAvg Tap-Ons: ", round(avg_tap_ons),
      "\nSE: ", round(se, 1),
      "\nCount: ", n
    )
  )

flood_bar_plot <- ggplot(flood_comparison, aes(x = IsFlood, y = avg_tap_ons, fill = IsFlood)) +
  geom_col_interactive(aes(tooltip = tooltip), width = 0.6) +
  geom_errorbar(aes(ymin = avg_tap_ons - se, ymax = avg_tap_ons + se), width = 0.2) +
  labs(
    title = "Average Train Usage on Flood vs Non-Flood Days",
    x = "Is Flood Day?",
    y = "Average Train Tap-Ons"
  ) +
  scale_fill_manual(values = c("FALSE" = "#90CAF9", "TRUE" = "#F48FB1")) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "none")

girafe(ggobj = flood_bar_plot)

5 References